# Run-wide settings: target device, random seed, and batch sizing.
Global:
  device: gpu
  seed: 1024

  # Intentionally unset. Written as an explicit null instead of a bare key so
  # the intent is unambiguous. Presumably the framework derives the value from
  # local_batch_size and the parallel layout -- TODO confirm against the loader.
  global_batch_size: null
  local_batch_size: 1
  micro_batch_size: 1


# Training-loop settings: step/epoch limits, logging and evaluation cadence,
# mixed-precision options, and checkpoint locations.
Engine:
  max_steps: 500000
  num_train_epochs: 1
  # Not configured here; explicit null replaces the former bare empty key.
  accumulate_steps: null
  logging_freq: 1
  eval_freq: 500
  eval_iters: 10
  # Not configured here; explicit null replaces the former bare empty key.
  test_iters: null
  mix_precision:
    enable: true
    dtype: "float16"
    level: "o2"
    # 32768.0 == 2**15.
    scale_loss: 32768.0
    # Operator names passed through to the mixed-precision machinery.
    # NOTE(review): presumably black-listed ops stay in full precision and
    # white-listed ops run in the reduced dtype -- confirm against the AMP docs.
    custom_black_list:
      - "reduce_sum"
      - "c_softmax_with_cross_entropy"
      - "elementwise_div"
    custom_white_list:
      - "lookup_table"
      - "lookup_table_v2"
  save_load:
    output_dir: ./output
    # No checkpoint directory configured; explicit null replaces the former
    # bare empty key.
    ckpt_dir: null


# Model selection plus attention / fusion feature switches.
Model:
  module: "GPTModuleAuto"
  name: "GPT"
  vocab_size_divisible_unit: 128
  fuse_attn_qkv: true
  scale_qk_by_layer_num: true
  fused_softmax_with_triangular: true
  use_flash_attn: false
  # TODO: make sequence_parallel an independent parallel mode and set it in
  # Distributed.
  sequence_parallel: false

# Dataset and batching configuration for the training and evaluation loops.
# Both sections below carry the same values.
Data:
  # Training input pipeline.
  Train:
    collate_fn: "gpt_collate_fn"
    sample_split: 2
    dataset:
      name: "GPTDataset"
      input_dir: "./data/"
      # NOTE(review): presumably relative train/valid/test weights -- confirm
      # against the dataset implementation.
      split: [969, 30, 1]
      max_seq_len: 1024

  # Evaluation input pipeline.
  Eval:
    collate_fn: "gpt_collate_fn"
    sample_split: 2
    dataset:
      name: "GPTDataset"
      input_dir: "./data/"
      # NOTE(review): presumably relative train/valid/test weights -- confirm
      # against the dataset implementation.
      split: [969, 30, 1]
      max_seq_len: 1024


# Optimizer hyper-parameters, learning-rate schedule, and gradient clipping.
Optimizer:
  name: "AdamW"
  weight_decay: 0.01
  beta1: 0.9
  beta2: 0.999
  epsilon: 1.0e-8
  # Learning-rate schedule selected by name; the remaining keys are its
  # parameters.
  lr:
    name: "CosineAnnealingWithWarmupDecay"
    decay_steps: 360000
    # NOTE(review): presumably the warmup share of decay_steps -- confirm
    # against the scheduler implementation.
    warmup_rate: 0.01
    max_lr: 5.0e-5
    min_lr: 1.0e-5
    use_increments: true
  grad_clip:
    name: "ClipGradByGlobalNorm"
    clip_norm: 1.0


# Fused-pass switches; both are turned off in this configuration.
FusedPasses:
  fused_linear: false
  fused_adamw: false
